import pandas as pd
data=pd.read_excel('Capstone_final_dataset.xlsx')
data.head()
time0_year | first_tx | race | black | SEX | age | above74 | age_cat | ABIRATERONE | enza | ... | a1c_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | ENZALUTA | White | 0 | M | 70 | 0 | 3 | 0 | 1 | ... | 1. <5.6 | 2 | 1 | 3 | 0 | 2. pre-f | 4 | 0.129032 | 0 | 9 |
1 | 2015 | ABIRATER | White | 0 | M | 68 | 0 | 2 | 1 | 0 | ... | 1. <5.6 | 4 | 2 | 7 | 0 | 1. non-f | 3 | 0.096774 | 0 | 7 |
2 | 2015 | ABIRATER | White | 0 | M | 71 | 0 | 3 | 1 | 0 | ... | 1. <5.6 | 3 | 1 | 5 | 0 | 1. non-f | 3 | 0.096774 | 0 | |
3 | 2015 | ABIRATER | White | 0 | M | 79 | 1 | 3 | 1 | 0 | ... | 4. >=7.2 | 7 | 3 | 10 | 0 | 3. mild | 9 | 0.290323 | 1 | 9 |
4 | 2016 | ABIRATER | White | 0 | M | 90 | 1 | 5 | 1 | 0 | ... | NaN | 3 | 1 | 6 | 0 | 4. moder | 10 | 0.322581 | 1 |
5 rows × 32 columns
data.shape
(5822, 32)
data.isnull().sum()
0 | |
---|---|
time0_year | 0 |
first_tx | 0 |
race | 0 |
black | 0 |
SEX | 0 |
age | 0 |
above74 | 0 |
age_cat | 0 |
ABIRATERONE | 0 |
enza | 0 |
first_tx_last_day_Supply | 0 |
days_bt_first_last_prescrib | 0 |
first_tx_daysSupply_sum | 0 |
fu_end_date_year | 0 |
death | 0 |
crcl_cat | 0 |
albumin_cat | 0 |
bilirubin_cat | 0 |
hgb_cat | 0 |
psa_cat | 0 |
PSACAT | 0 |
BMI_cat | 0 |
a1c_cat | 2401 |
cindex_b_Romano | 0 |
char_cat | 0 |
Total_Elixhauser_Groups | 0 |
DOCETAXEL_bf_time0 | 0 |
fi_score_cat | 0 |
dis_number | 0 |
fi_score | 0 |
frail | 0 |
gleason_reviewed | 0 |
data.drop('a1c_cat', axis=1, inplace=True)
data.head()
time0_year | first_tx | race | black | SEX | age | above74 | age_cat | ABIRATERONE | enza | ... | BMI_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | ENZALUTA | White | 0 | M | 70 | 0 | 3 | 0 | 1 | ... | 2 | 2 | 1 | 3 | 0 | 2. pre-f | 4 | 0.129032 | 0 | 9 |
1 | 2015 | ABIRATER | White | 0 | M | 68 | 0 | 2 | 1 | 0 | ... | 4 | 4 | 2 | 7 | 0 | 1. non-f | 3 | 0.096774 | 0 | 7 |
2 | 2015 | ABIRATER | White | 0 | M | 71 | 0 | 3 | 1 | 0 | ... | 4 | 3 | 1 | 5 | 0 | 1. non-f | 3 | 0.096774 | 0 | |
3 | 2015 | ABIRATER | White | 0 | M | 79 | 1 | 3 | 1 | 0 | ... | 2 | 7 | 3 | 10 | 0 | 3. mild | 9 | 0.290323 | 1 | 9 |
4 | 2016 | ABIRATER | White | 0 | M | 90 | 1 | 5 | 1 | 0 | ... | 4 | 3 | 1 | 6 | 0 | 4. moder | 10 | 0.322581 | 1 |
5 rows × 31 columns
data.isnull().sum()
0 | |
---|---|
time0_year | 0 |
first_tx | 0 |
race | 0 |
black | 0 |
SEX | 0 |
age | 0 |
above74 | 0 |
age_cat | 0 |
ABIRATERONE | 0 |
enza | 0 |
first_tx_last_day_Supply | 0 |
days_bt_first_last_prescrib | 0 |
first_tx_daysSupply_sum | 0 |
fu_end_date_year | 0 |
death | 0 |
crcl_cat | 0 |
albumin_cat | 0 |
bilirubin_cat | 0 |
hgb_cat | 0 |
psa_cat | 0 |
PSACAT | 0 |
BMI_cat | 0 |
cindex_b_Romano | 0 |
char_cat | 0 |
Total_Elixhauser_Groups | 0 |
DOCETAXEL_bf_time0 | 0 |
fi_score_cat | 0 |
dis_number | 0 |
fi_score | 0 |
frail | 0 |
gleason_reviewed | 0 |
import seaborn as sns
import matplotlib.pyplot as plt
# Check unique values in 'first_tx'
print(data['first_tx'].unique())
['ENZALUTA' 'ABIRATER']
# Mapping categorical values to numeric
data['first_tx'] = data['first_tx'].map({'ENZALUTA': 1, 'ABIRATER': 0})
data.head()
time0_year | first_tx | race | black | SEX | age | above74 | age_cat | ABIRATERONE | enza | ... | BMI_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | 1 | White | 0 | M | 70 | 0 | 3 | 0 | 1 | ... | 2 | 2 | 1 | 3 | 0 | 2. pre-f | 4 | 0.129032 | 0 | 9 |
1 | 2015 | 0 | White | 0 | M | 68 | 0 | 2 | 1 | 0 | ... | 4 | 4 | 2 | 7 | 0 | 1. non-f | 3 | 0.096774 | 0 | 7 |
2 | 2015 | 0 | White | 0 | M | 71 | 0 | 3 | 1 | 0 | ... | 4 | 3 | 1 | 5 | 0 | 1. non-f | 3 | 0.096774 | 0 | |
3 | 2015 | 0 | White | 0 | M | 79 | 1 | 3 | 1 | 0 | ... | 2 | 7 | 3 | 10 | 0 | 3. mild | 9 | 0.290323 | 1 | 9 |
4 | 2016 | 0 | White | 0 | M | 90 | 1 | 5 | 1 | 0 | ... | 4 | 3 | 1 | 6 | 0 | 4. moder | 10 | 0.322581 | 1 |
5 rows × 31 columns
# Check unique values in 'race'
print(data['race'].unique())
['White' 'Black' 'Other' 'Unknown']
# Mapping categorical values to numeric
data['race'] = data['race'].map({'White': 1, 'Black' : 0, 'Other': 3, 'Unknown': 4})
data.head()
time0_year | first_tx | race | black | SEX | age | above74 | age_cat | ABIRATERONE | enza | ... | BMI_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | 1 | 1 | 0 | M | 70 | 0 | 3 | 0 | 1 | ... | 2 | 2 | 1 | 3 | 0 | 2. pre-f | 4 | 0.129032 | 0 | 9 |
1 | 2015 | 0 | 1 | 0 | M | 68 | 0 | 2 | 1 | 0 | ... | 4 | 4 | 2 | 7 | 0 | 1. non-f | 3 | 0.096774 | 0 | 7 |
2 | 2015 | 0 | 1 | 0 | M | 71 | 0 | 3 | 1 | 0 | ... | 4 | 3 | 1 | 5 | 0 | 1. non-f | 3 | 0.096774 | 0 | |
3 | 2015 | 0 | 1 | 0 | M | 79 | 1 | 3 | 1 | 0 | ... | 2 | 7 | 3 | 10 | 0 | 3. mild | 9 | 0.290323 | 1 | 9 |
4 | 2016 | 0 | 1 | 0 | M | 90 | 1 | 5 | 1 | 0 | ... | 4 | 3 | 1 | 6 | 0 | 4. moder | 10 | 0.322581 | 1 |
5 rows × 31 columns
# Check unique values in 'sex'
print(data['SEX'].unique())
['M']
data.drop('SEX', axis=1, inplace=True)
data.head()
time0_year | first_tx | race | black | age | above74 | age_cat | ABIRATERONE | enza | first_tx_last_day_Supply | ... | BMI_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | 1 | 1 | 0 | 70 | 0 | 3 | 0 | 1 | 30 | ... | 2 | 2 | 1 | 3 | 0 | 2. pre-f | 4 | 0.129032 | 0 | 9 |
1 | 2015 | 0 | 1 | 0 | 68 | 0 | 2 | 1 | 0 | 90 | ... | 4 | 4 | 2 | 7 | 0 | 1. non-f | 3 | 0.096774 | 0 | 7 |
2 | 2015 | 0 | 1 | 0 | 71 | 0 | 3 | 1 | 0 | 30 | ... | 4 | 3 | 1 | 5 | 0 | 1. non-f | 3 | 0.096774 | 0 | |
3 | 2015 | 0 | 1 | 0 | 79 | 1 | 3 | 1 | 0 | 28 | ... | 2 | 7 | 3 | 10 | 0 | 3. mild | 9 | 0.290323 | 1 | 9 |
4 | 2016 | 0 | 1 | 0 | 90 | 1 | 5 | 1 | 0 | 30 | ... | 4 | 3 | 1 | 6 | 0 | 4. moder | 10 | 0.322581 | 1 |
5 rows × 30 columns
# Check unique values in 'fi_score_cat'
print(data['fi_score_cat'].unique())
['2. pre-f' '1. non-f' '3. mild' '4. moder' '5. sever']
# Mapping categorical values to numeric
data['fi_score_cat'] = data['fi_score_cat'].map({'1. non-f': 0, '2. pre-f' : 1, '3. mild': 2, '4. moder': 3,'5. sever':5})
data.head()
time0_year | first_tx | race | black | age | above74 | age_cat | ABIRATERONE | enza | first_tx_last_day_Supply | ... | BMI_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | 1 | 1 | 0 | 70 | 0 | 3 | 0 | 1 | 30 | ... | 2 | 2 | 1 | 3 | 0 | 1 | 4 | 0.129032 | 0 | 9 |
1 | 2015 | 0 | 1 | 0 | 68 | 0 | 2 | 1 | 0 | 90 | ... | 4 | 4 | 2 | 7 | 0 | 0 | 3 | 0.096774 | 0 | 7 |
2 | 2015 | 0 | 1 | 0 | 71 | 0 | 3 | 1 | 0 | 30 | ... | 4 | 3 | 1 | 5 | 0 | 0 | 3 | 0.096774 | 0 | |
3 | 2015 | 0 | 1 | 0 | 79 | 1 | 3 | 1 | 0 | 28 | ... | 2 | 7 | 3 | 10 | 0 | 2 | 9 | 0.290323 | 1 | 9 |
4 | 2016 | 0 | 1 | 0 | 90 | 1 | 5 | 1 | 0 | 30 | ... | 4 | 3 | 1 | 6 | 0 | 3 | 10 | 0.322581 | 1 |
5 rows × 30 columns
# Check unique values in 'psa_cat'
print(data['psa_cat'].unique())
['Cat5. 50' 'Cat2. 4' 'Cat3. 10' 'Cat4. 20' 'Cat7. 20' 'Cat1. 0' 'Cat6. 10' 'Unknown']
# Mapping categorical values to numeric
data['psa_cat'] = data['psa_cat'].map({'Cat5. 50':5, 'Cat2. 4':2, 'Cat3. 10':3, 'Cat4. 20':4, 'Cat7. 20':7, 'Cat1. 0':1,
'Cat6. 10':6, 'Unknown':0})
data.head()
time0_year | first_tx | race | black | age | above74 | age_cat | ABIRATERONE | enza | first_tx_last_day_Supply | ... | BMI_cat | cindex_b_Romano | char_cat | Total_Elixhauser_Groups | DOCETAXEL_bf_time0 | fi_score_cat | dis_number | fi_score | frail | gleason_reviewed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017 | 1 | 1 | 0 | 70 | 0 | 3 | 0 | 1 | 30 | ... | 2 | 2 | 1 | 3 | 0 | 1 | 4 | 0.129032 | 0 | 9 |
1 | 2015 | 0 | 1 | 0 | 68 | 0 | 2 | 1 | 0 | 90 | ... | 4 | 4 | 2 | 7 | 0 | 0 | 3 | 0.096774 | 0 | 7 |
2 | 2015 | 0 | 1 | 0 | 71 | 0 | 3 | 1 | 0 | 30 | ... | 4 | 3 | 1 | 5 | 0 | 0 | 3 | 0.096774 | 0 | |
3 | 2015 | 0 | 1 | 0 | 79 | 1 | 3 | 1 | 0 | 28 | ... | 2 | 7 | 3 | 10 | 0 | 2 | 9 | 0.290323 | 1 | 9 |
4 | 2016 | 0 | 1 | 0 | 90 | 1 | 5 | 1 | 0 | 30 | ... | 4 | 3 | 1 | 6 | 0 | 3 | 10 | 0.322581 | 1 |
5 rows × 30 columns
import pandas as pd
# Assuming df is your DataFrame
data['gleason_reviewed'] = data['gleason_reviewed'].fillna(0)
data.isnull().sum()
0 | |
---|---|
time0_year | 0 |
first_tx | 0 |
race | 0 |
black | 0 |
age | 0 |
above74 | 0 |
age_cat | 0 |
ABIRATERONE | 0 |
enza | 0 |
first_tx_last_day_Supply | 0 |
days_bt_first_last_prescrib | 0 |
first_tx_daysSupply_sum | 0 |
fu_end_date_year | 0 |
death | 0 |
crcl_cat | 0 |
albumin_cat | 0 |
bilirubin_cat | 0 |
hgb_cat | 0 |
psa_cat | 0 |
PSACAT | 0 |
BMI_cat | 0 |
cindex_b_Romano | 0 |
char_cat | 0 |
Total_Elixhauser_Groups | 0 |
DOCETAXEL_bf_time0 | 0 |
fi_score_cat | 0 |
dis_number | 0 |
fi_score | 0 |
frail | 0 |
gleason_reviewed | 0 |
# Replace empty strings with NaN
import numpy as np
data = data.replace(' ', np.nan)
<ipython-input-76-5a78547b3815>:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` data = data.replace(' ', np.nan)
import pandas as pd
# Assuming df is your DataFrame
data['gleason_reviewed'] = data['gleason_reviewed'].fillna(0)
# Check unique values in 'gleason_reviewed'
print(data['gleason_reviewed'].unique())
[ 9. 7. 0. 8. 6. 10. 5. 2. 4. 3.]
# Calculate the correlation matrix
correlation_matrix = data.corr()
# Create a heatmap
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Set the aesthetics for the plots
sns.set(style='whitegrid')
# List of all columns in the DataFrame excluding 'death'
columns_to_visualize = data.columns.drop('death')
# Create a figure to visualize the relationship with 'death'
plt.figure(figsize=(20, 25))
# Loop through each column and create plots
for i, col in enumerate(columns_to_visualize):
plt.subplot(6, 6, i + 1) # Adjust the number of rows and columns for the grid
if data[col].nunique() < 20: # Categorical variable
sns.countplot(x='death', hue=col, data=data, palette='viridis')
plt.title(f'Death vs {col}')
else: # Numeric variable
sns.boxplot(x='death', y=col, data=data, palette='viridis')
plt.title(f'Death vs {col}')
plt.xlabel('Death (1 = Yes, 0 = No)')
plt.ylabel(col)
plt.tight_layout()
plt.show()
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis') <ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis') <ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis') <ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis') <ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis') <ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis') <ipython-input-80-4ca33512ba3f>:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='death', y=col, data=data, palette='viridis')
import matplotlib.pyplot as plt
pd.plotting.scatter_matrix(data, figsize=(50, 50))
plt.show()
print(data.columns)
Index(['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat', 'ABIRATERONE', 'enza', 'first_tx_last_day_Supply', 'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum', 'fu_end_date_year', 'death', 'crcl_cat', 'albumin_cat', 'bilirubin_cat', 'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano', 'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0', 'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed'], dtype='object')
import matplotlib.pyplot as plt
# Grouping the data by the features and calculating the mean of 'death'
enza_death = data.groupby('enza')['death'].mean()
abiraterone_death = data.groupby('ABIRATERONE')['death'].mean()
psa_cat_death = data.groupby('psa_cat')['death'].mean()
# Plotting death vs. enza
plt.figure(figsize=(12, 6))
plt.plot(enza_death.index, enza_death.values, marker='o', label='Death vs Enza')
plt.title('Line Graph: Death vs Enza')
plt.xlabel('Enza (0 or 1)')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()
# Plotting death vs. ABIRATERONE
plt.figure(figsize=(12, 6))
plt.plot(abiraterone_death.index, abiraterone_death.values, marker='o', color='orange', label='Death vs ABIRATERONE')
plt.title('Line Graph: Death vs ABIRATERONE')
plt.xlabel('ABIRATERONE (0 or 1)')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()
# Plotting death vs. psa_cat
plt.figure(figsize=(12, 6))
plt.plot(psa_cat_death.index, psa_cat_death.values, marker='o', color='green', label='Death vs PSA Category')
plt.title('Line Graph: Death vs PSA Category')
plt.xlabel('PSA Category')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()
LINEAR REGRESSION
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
#Selecting required attributes for analysis
X = data[['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
'fu_end_date_year', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed']]
y = data['death'] # Target variable
#Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
#Fitting a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
#Making predictions on the test set
y_pred = model.predict(X_test)
# Printing the results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
Training Accuracy: 0.9742323384152888 Test Accuracy: 0.967381974248927
LOGISTIC REGRESSION
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Selecting required attributes for analysis
X = data[['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
'fu_end_date_year', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed']] # Selecting features
y = data['death'] # Target variable
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
# Fitting a Logistic Regression model
model = LogisticRegression(max_iter=1000) # You can increase max_iter if convergence issues arise
model.fit(X_train, y_train)
# Making predictions on the test set
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)
# Evaluating the model
test_accuracy = accuracy_score(y_test, y_pred_test)
train_accuracy = accuracy_score(y_train, y_pred_train)
# Printing the results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
# Confusion matrix for the test data
conf_matrix = confusion_matrix(y_test, y_pred_test)
classification_rep = classification_report(y_test, y_pred_test)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)
Training Accuracy: 0.8937083959630664 Test Accuracy: 0.8755364806866953 Confusion Matrix: [[186 86] [ 59 834]] Classification Report: precision recall f1-score support 0 0.76 0.68 0.72 272 1 0.91 0.93 0.92 893 accuracy 0.88 1165 macro avg 0.83 0.81 0.82 1165 weighted avg 0.87 0.88 0.87 1165
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Initialize a StandardScaler
scaler = StandardScaler()
# Fit and transform the training data, and transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit logistic regression with scaled data
log_reg_model = LogisticRegression(max_iter=10000000)
log_reg_model.fit(X_train_scaled, y_train)
# Cross-validation
cross_val_scores = cross_val_score(log_reg_model, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", cross_val_scores)
print("Mean cross-validation score:", cross_val_scores.mean())
# Predictions on test data
y_pred = log_reg_model.predict(X_test_scaled)
# Evaluate accuracy
train_accuracy = log_reg_model.score(X_train_scaled, y_train)
test_accuracy = log_reg_model.score(X_test_scaled, y_test)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
Cross-validation scores: [0.97532189 0.972103 0.97099893 0.97851772 0.97207304] Mean cross-validation score: 0.973802916242169 Training Accuracy: 0.9742323384152888 Test Accuracy: 0.9682403433476395
log_reg_model = LogisticRegression(solver='saga', max_iter=10000000)
log_reg_model.fit(X_train_scaled, y_train)
LogisticRegression(max_iter=10000000, solver='saga')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=10000000, solver='saga')
from sklearn.model_selection import cross_val_score
# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result( /usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result( /usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result( /usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
Cross-validation scores: [0.8832618 0.89613734 0.88745704 0.89690722 0.89261168] Mean cross-validation score: 0.8912750173296168
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
from sklearn.ensemble import RandomForestClassifier
# Initialize and fit a Random Forest model
rf_model = RandomForestClassifier()
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5)
print(f'Cross-validation scores for Random Forest: {cv_scores_rf}')
print(f'Mean cross-validation score for Random Forest: {cv_scores_rf.mean()}')
Cross-validation scores for Random Forest: [0.9751073 0.97339056 0.97164948 0.97508591 0.96563574] Mean cross-validation score for Random Forest: 0.9721737976195743
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'bootstrap': [True, False]
}
# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)
# Best parameters and score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')
Fitting 5 folds for each of 216 candidates, totalling 1080 fits Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300} Best cross-validation score: 0.9730324616904857
the grid search successfully completed, and we found the best parameters for the Random Forest model:
Best Parameters:
bootstrap: True max_depth: 10 min_samples_leaf: 4 min_samples_split: 2 n_estimators: 100 Best Cross-Validation Score: 0.973
RANDOM FOREST
# Train the final Random Forest model with the best parameters
rf_best = RandomForestClassifier(
bootstrap=True,
max_depth=10,
min_samples_leaf=4,
min_samples_split=2,
n_estimators=100,
random_state=42
)
# Fit the model on the training data
rf_best.fit(X_train, y_train)
# Make predictions on the test set
y_pred = rf_best.predict(X_test)
# Evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f'Test Accuracy: {test_accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')
Test Accuracy: 0.967381974248927 Confusion Matrix: [[270 2] [ 36 857]] Classification Report: precision recall f1-score support 0 0.88 0.99 0.93 272 1 1.00 0.96 0.98 893 accuracy 0.97 1165 macro avg 0.94 0.98 0.96 1165 weighted avg 0.97 0.97 0.97 1165
The Random Forest model has performed quite well on the test set:
Test Accuracy: 96.7%, indicating the model correctly classified about 97% of the instances. Confusion Matrix: True Negatives (correctly predicted 0): 270 False Positives (incorrectly predicted 1): 2 False Negatives (incorrectly predicted 0): 36 True Positives (correctly predicted 1): 857 Classification Report: Class 0 (Non-survivor): Precision: 0.88 (88% of instances predicted as class 0 were correct) Recall: 0.99 (99% of actual class 0 instances were correctly identified) F1-score: 0.93 (harmonic mean of precision and recall) Class 1 (Survivor): Precision: 1.00 (100% of instances predicted as class 1 were correct) Recall: 0.96 (96% of actual class 1 instances were correctly identified) F1-score: 0.98 The model has a high precision and recall, especially for class 1, suggesting it is very effective at predicting survival. It might be worth exploring feature importance or partial dependence plots to understand which features contribute most to the predictions.
#PREDICTING THE MODEL
import pandas as pd
import numpy as np
# Select a specific row of data (e.g., the first row)
input_data = data.iloc[0, :29].values
# Convert the selected row to a numpy array
input_data_as_numpy_array = np.asarray(input_data)
# Reshape the array for prediction (only one instance)
data_reshaped = input_data_as_numpy_array.reshape(1, -1)
# Make the prediction using the model
prediction = model.predict(data_reshaped)
# Output the prediction result
if prediction[0] == 0:
print('The Person did not die due to prostate cancer')
else:
print('The Person has died due to prostate cancer')
The Person has died due to prostate cancer
/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names warnings.warn(
# Importing the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Assuming 'data' is the DataFrame that contains the dataset, and 'death' is the target variable
X = data.drop(columns=['death'])
y = data['death']
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
# Standardizing the features for KNN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 1. K-Nearest Neighbors (KNN) Model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
# Predictions and Evaluation for KNN
knn_predictions = knn_model.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_conf_matrix = confusion_matrix(y_test, knn_predictions)
knn_class_report = classification_report(y_test, knn_predictions)
# Calculate training accuracy
train_predictions = knn_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)
# Calculate test accuracy
test_accuracy = accuracy_score(y_test, knn_predictions)
# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("KNN Model Evaluation")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", knn_conf_matrix)
print("Classification Report:\n", knn_class_report)
Training Accuracy: 0.9233412067854843 Test Accuracy: 0.8721030042918455 KNN Model Evaluation Accuracy: 0.8721030042918455 Confusion Matrix: [[156 116] [ 33 860]] Classification Report: precision recall f1-score support 0 0.83 0.57 0.68 272 1 0.88 0.96 0.92 893 accuracy 0.87 1165 macro avg 0.85 0.77 0.80 1165 weighted avg 0.87 0.87 0.86 1165
# 2. Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
# Predictions and Evaluation for Naive Bayes
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_conf_matrix = confusion_matrix(y_test, nb_predictions)
nb_class_report = classification_report(y_test, nb_predictions)
# Calculate training accuracy
train_predictions = nb_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)
# Calculate test accuracy
test_accuracy = accuracy_score(y_test, nb_predictions)
# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("\nNaive Bayes Model Evaluation")
print("Accuracy:", nb_accuracy)
print("Confusion Matrix:\n", nb_conf_matrix)
print("Classification Report:\n", nb_class_report)
Training Accuracy: 0.789778827571398 Test Accuracy: 0.9622317596566523 Naive Bayes Model Evaluation Accuracy: 0.9622317596566523 Confusion Matrix: [[260 12] [ 32 861]] Classification Report: precision recall f1-score support 0 0.89 0.96 0.92 272 1 0.99 0.96 0.98 893 accuracy 0.96 1165 macro avg 0.94 0.96 0.95 1165 weighted avg 0.96 0.96 0.96 1165
/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but GaussianNB was fitted with feature names warnings.warn(
# Importing the necessary libraries
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
X = data.drop(columns=['death'])
y = data['death']
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
# Standardizing the features for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# SVM Model with a linear kernel
svm_model = SVC(kernel='linear', C=1.0, random_state=50)
svm_model.fit(X_train_scaled, y_train)
# Predictions and Evaluation for SVM
svm_predictions = svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)
svm_class_report = classification_report(y_test, svm_predictions)
# Calculate training accuracy
train_predictions = svm_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)
# Calculate test accuracy
test_accuracy = accuracy_score(y_test, svm_predictions)
# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("SVM Model Evaluation")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_conf_matrix)
print("Classification Report:\n", svm_class_report)
Training Accuracy: 0.9740176079020829 Test Accuracy: 0.9682403433476395 SVM Model Evaluation Accuracy: 0.9682403433476395 Confusion Matrix: [[272 0] [ 37 856]] Classification Report: precision recall f1-score support 0 0.88 1.00 0.94 272 1 1.00 0.96 0.98 893 accuracy 0.97 1165 macro avg 0.94 0.98 0.96 1165 weighted avg 0.97 0.97 0.97 1165
#@title Convert ipynb to HTML in Colab
# Upload ipynb
from google.colab import files
f = files.upload()
# Convert ipynb to html
import subprocess
file0 = list(f.keys())[0]
_ = subprocess.run(["pip", "install", "nbconvert"])
_ = subprocess.run(["jupyter", "nbconvert", file0, "--to", "html"])
# download the html
files.download(file0[:-5]+"html")
Saving capstone (1).ipynb to capstone (1).ipynb